---
title: Customer segmentation toolkit
keywords: fastai
sidebar: home_sidebar
summary: 'Data transformations toolkit made by Team #2 for the MLOps Engineering Lab #2 "Feature Store for ML".'
description: 'Data transformations toolkit made by Team #2 for the MLOps Engineering Lab #2 "Feature Store for ML".'
nb_path: "nbs/index.ipynb"
---
{% raw %}
{% endraw %}

Installation

pip install -U customer-segmentation-toolkit

Usage

{% raw %}
import pandas as pd
from pathlib import Path
{% endraw %}

01. Load and split dataset

{% raw %}
import datetime

from customer_segmentation_toolkit.data_zoo import download_data_csv
from customer_segmentation_toolkit.load_split import split_by_invoice_date

# Date at which the full dataset is cut into an "offline" and an "online" part.
ONLINEOFFLINE_DATE_SPLIT = datetime.date(year=2011, month=10, day=1)

# Fetch the original dataset from the remote data zoo.
df = download_data_csv('/data/data.csv')
print(f'Downloaded dataset, shape: {df.shape}')

# Cut the dataset into its offline and online parts at the split date.
df_offline, df_online = split_by_invoice_date(df, ONLINEOFFLINE_DATE_SPLIT)

# Report the size and the invoice date range of each part, offline first.
for part_name, part_df in (('Offline', df_offline), ('Online', df_online)):
    invoice_dates = part_df["InvoiceDate"]
    print(f'{part_name} dataset shape: {part_df.shape}')
    print(f'{part_name} invoices: from {invoice_dates.min()} to {invoice_dates.max()}')
Downloaded dataset, shape: (541909, 8)
Offline dataset shape: (370931, 8)
Offline invoices: from 2010-12-01 08:26:00 to 2011-09-30 17:22:00
Online dataset shape: (170978, 8)
Online invoices: from 2011-10-02 10:32:00 to 2011-12-09 12:50:00
{% endraw %} {% raw %}
# Directory that receives the artefacts of step 01; created if it is missing.
OUTPUT = Path('../data/output/01_data_split_offline_online')
OUTPUT.mkdir(parents=True, exist_ok=True)

# Persist both parts of the split together with the split date that produced them.
df_offline.to_csv(OUTPUT / 'no_live_data.csv', index=False)
df_online.to_csv(OUTPUT / 'raw_live_data.csv', index=False)
(OUTPUT / 'onlineoffline_date_split.txt').write_text(str(ONLINEOFFLINE_DATE_SPLIT))

# List what the output directory now contains.
print(f'Output data saved to {OUTPUT}: {[p.name for p in Path(OUTPUT).iterdir()]}')
Output data saved to ../data/output/01_data_split_offline_online: ['onlineoffline_date_split.txt', 'no_live_data.csv', 'raw_live_data.csv']
{% endraw %}

02. Clean dataset rows

{% raw %}
from customer_segmentation_toolkit.load_split import load_data_csv
from customer_segmentation_toolkit.clean_rows import clean_data_rows

# Loading raw offline dataset (from a local path) written by step 01
df = load_data_csv('../data/output/01_data_split_offline_online/no_live_data.csv')
print(f'Loaded raw offline dataset, shape: {df.shape}')

# Cleaning the dataset; the result is bound to a new name, `df` keeps the raw data
df_cleaned = clean_data_rows(df)
# Report the shape of the cleaned frame (previously this printed the raw `df` shape again)
print(f'Cleaned offline dataset shape: {df_cleaned.shape}')
Loaded raw offline dataset, shape: (370931, 8)
Cleaned offline dataset shape: (370931, 8)
{% endraw %} {% raw %}
# Directory that receives the artefacts of step 02; created if it is missing.
OUTPUT = Path('../data/output/02_data_clean_rows')
OUTPUT.mkdir(parents=True, exist_ok=True)

# Persist the cleaned offline dataset.
df_cleaned.to_csv(OUTPUT / 'no_live_data__cleaned.csv', index=False)

# List what the output directory now contains.
print(f'Output data saved to {OUTPUT}: {[p.name for p in Path(OUTPUT).iterdir()]}')
Output data saved to ../data/output/02_data_clean_rows: ['no_live_data__cleaned.csv']
{% endraw %}

03. Analyse purchases

{% raw %}
import datetime

from customer_segmentation_toolkit.analyse_purchases import build_product_list
from customer_segmentation_toolkit.load_split import load_data_csv

# Parameters of step 03: number of purchase clusters and the train/test split date.
N_PURCHASE_CLUSTERS = 5
TRAINTEST_DATE_SPLIT = datetime.date(year=2011, month=8, day=1)

# Read back the cleaned offline dataset written by step 02.
df_cleaned = load_data_csv('../data/output/02_data_clean_rows/no_live_data__cleaned.csv')
print(f'Loaded cleaned offline dataset, shape: {df_cleaned.shape}')

# Build the product list and preview its first rows as a DataFrame.
list_products = build_product_list(df_cleaned)
print('Built list of products:')
print(pd.DataFrame(list_products).head())
print('...')
Loaded cleaned offline dataset, shape: (263815, 10)
Built list of products:
         0    1
0    heart  251
1  vintage  195
2      set  194
3      bag  158
4      box  147
...
{% endraw %} {% raw %}
from customer_segmentation_toolkit.analyse_purchases import build_keywords_matrix

# Building keywords count matrix from the cleaned dataset and the product list.
# THRESHOLD is handed to build_keywords_matrix as its third argument.
# NOTE(review): the recorded output ends with range columns such as `0<.<1` ... `.>10`,
# which presumably derive from these boundaries — confirm against build_keywords_matrix.
THRESHOLD = [0, 1, 2, 3, 5, 10]
matrix = build_keywords_matrix(df_cleaned, list_products, THRESHOLD)
# Show the matrix dimensions and its first rows.
print(f'Built keywords count matrix (shape: {matrix.shape}):')
print(matrix.head())
Built keywords count matrix (shape: (3662, 188)):
   heart  vintage  set  bag  box  glass  christmas  design  candle  flower  \
0      1        0    0    0    0      0          0       0       0       0   
1      0        0    0    0    0      0          0       0       0       0   
2      1        0    0    0    0      0          0       0       0       0   
3      0        0    0    0    0      0          0       0       0       0   
4      1        0    0    0    0      0          0       0       0       0   

   ...  medium  hen  wallet  point  0<.<1  1<.<2  2<.<3  3<.<5  5<.<10  .>10  
0  ...       0    0       0      0      0      0      1      0       0     0  
1  ...       0    0       0      0      0      0      0      1       0     0  
2  ...       0    0       0      0      0      0      0      1       0     0  
3  ...       0    0       0      0      0      0      0      1       0     0  
4  ...       0    0       0      0      0      0      0      1       0     0  

[5 rows x 188 columns]
{% endraw %} {% raw %}
from customer_segmentation_toolkit.analyse_purchases import compute_purchase_clusters

# Cluster the keywords matrix into N_PURCHASE_CLUSTERS groups (Kmeans).
clusters = compute_purchase_clusters(matrix, N_PURCHASE_CLUSTERS)

# Show how many entries were assigned to each cluster label.
print('Built purchase clusters:')
print(pd.Series(clusters).value_counts())
Built purchase clusters:
1    980
3    911
4    638
0    567
2    566
dtype: int64
{% endraw %} {% raw %}
from sklearn.metrics import silhouette_samples, silhouette_score
from customer_segmentation_toolkit.analyse_purchases import plot_silhouette

# Silhouette analysis of the purchase clusters computed on the keywords matrix.
# silhouette_score yields one aggregate value for the whole clustering;
# it is stored here but not passed to the plot call below.
silhouette_avg = silhouette_score(matrix, clusters)
# silhouette_samples yields one silhouette value per sample of `matrix`.
sample_silhouette_values = silhouette_samples(matrix, clusters)
# Plotting silhouette values
# NOTE(review): [-0.07, 0.33] is presumably the value range shown on the plot — confirm against plot_silhouette.
plot_silhouette(N_PURCHASE_CLUSTERS, [-0.07, 0.33], len(matrix), sample_silhouette_values, clusters)
2021-05-27T20:45:15.278818 image/svg+xml Matplotlib v3.4.2, https://matplotlib.org/
{% endraw %} {% raw %}
from customer_segmentation_toolkit.analyse_purchases import add_purchase_clusters_info

# Combine the cleaned offline data with the purchase-cluster assignment
# into the result DataFrame used by the following steps.
df_with_clusters = add_purchase_clusters_info(
    df_cleaned,
    clusters,
    N_PURCHASE_CLUSTERS,
)

# Summarise the resulting frame.
print('Added purchase clusters info to the offline cleaned dataset:')
print(f'Shape: {df_with_clusters.shape}')
print(f'Columns: {list(df_with_clusters.columns)}')
Added purchase clusters info to the offline cleaned dataset:
Shape: (13081, 9)
Columns: ['CustomerID', 'InvoiceNo', 'Basket Price', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4', 'InvoiceDate']
{% endraw %} {% raw %}
from customer_segmentation_toolkit.load_split import split_by_invoice_date

# Splitting the new dataset (offline + cluster info) into train and test parts
# at TRAINTEST_DATE_SPLIT, using the same helper that produced the offline/online split.
df_offline_train, df_offline_test = split_by_invoice_date(df_with_clusters, TRAINTEST_DATE_SPLIT)
# Report the shape of both parts.
print(f'Splitted: train of shape {df_offline_train.shape} + test of shape {df_offline_test.shape}')
Splitted: train of shape (10054, 9) + test of shape (3027, 9)
{% endraw %} {% raw %}
# Directory that receives the artefacts of step 03; created if it is missing.
OUTPUT = Path('../data/output/03_data_compute_description_keywords')
OUTPUT.mkdir(parents=True, exist_ok=True)

# Data frames: the keywords matrix and the train/test parts with cluster info.
matrix.to_csv(OUTPUT / 'no_live_data__cleaned__keywords.csv', index=False)
df_offline_train.to_csv(OUTPUT / 'no_live_data__cleaned__purchase_clusters__train.csv', index=False)
df_offline_test.to_csv(OUTPUT / 'no_live_data__cleaned__purchase_clusters__test.csv', index=False)

# Parameters used by this step, stored next to the data so later steps can reload them.
pd.DataFrame(THRESHOLD, columns=['threshold']).to_csv(OUTPUT / 'threshold.csv', index=False)

(OUTPUT / 'n_purchase_clusters.txt').write_text(str(N_PURCHASE_CLUSTERS))
(OUTPUT / 'traintest_date_split.txt').write_text(str(TRAINTEST_DATE_SPLIT))

# List what the output directory now contains.
print(f'Output data saved to {OUTPUT}: {[p.name for p in Path(OUTPUT).iterdir()]}')
Output data saved to ../data/output/03_data_compute_description_keywords: ['no_live_data__cleaned__keywords.csv', 'no_live_data__cleaned__purchase_clusters__test.csv', 'threshold.csv', 'n_purchase_clusters.txt', 'traintest_date_split.txt', 'no_live_data__cleaned__purchase_clusters__train.csv']
{% endraw %}

04. Analyse customer categories

{% raw %}
from customer_segmentation_toolkit.load_split import load_data_csv

# Parameters of step 04.
N_CUSTOMER_CLUSTERS = 11
SELECTED_CUSTOMERS_CATEG_THRESHOLD = 40

# Step 03 output directory: reload the number of purchase clusters stored there.
DATA = '../data/output/03_data_compute_description_keywords'
N_PURCHASE_CLUSTERS = int(Path(DATA, 'n_purchase_clusters.txt').read_text().strip())

# Load the train part of the offline data with purchase-cluster info and preview it.
basket_price = load_data_csv(f'{DATA}/no_live_data__cleaned__purchase_clusters__train.csv')
print(f'Loaded purchase clusters data of shape: {basket_price.shape}')
print(basket_price.head())
print('...')
Loaded purchase clusters data of shape: (10054, 9)
  CustomerID  InvoiceNo  Basket Price  categ_0  categ_1  categ_2  categ_3  \
0      12347     537626        711.79   124.44    187.2   293.35    23.40   
1      12347     542237        475.39    38.25    130.5   169.20    84.34   
2      12347     549222        636.25    38.25    330.9   115.00    81.00   
3      12347     556201        382.52    19.90     74.4   168.76    41.40   
4      12348     539318        892.80   240.00    174.0     0.00   478.80   

   categ_4                   InvoiceDate  
0    83.40 2010-12-07 14:57:00.000001024  
1    53.10 2011-01-26 14:29:59.999999744  
2    71.10 2011-04-07 10:42:59.999999232  
3    78.06 2011-06-09 13:01:00.000000256  
4     0.00 2010-12-16 19:09:00.000000000  
...
{% endraw %} {% raw %}
from customer_segmentation_toolkit.analyse_customers import build_transactions_per_user

# Building transactions per user from the per-invoice basket data loaded above.
# The recorded output shows one row per CustomerID with count/min/max/mean/sum and categ_* columns.
transactions_per_user = build_transactions_per_user(basket_price, n_purchase_clusters=N_PURCHASE_CLUSTERS)
# Show the frame dimensions and its first rows.
print(f'Built transactions per user, shape: {transactions_per_user.shape}')
print(transactions_per_user.head())
print('...')
Built transactions per user, shape: (3143, 13)
  CustomerID  count     min     max        mean      sum    categ_0  \
0      12347      4  382.52  711.79  551.487500  2205.95  10.011106   
1      12348      3  227.44  892.80  495.746667  1487.24  21.516366   
2      12350      1  334.40  334.40  334.400000   334.40  11.961722   
3      12352      4  144.35  840.30  360.370000  1441.48  78.356966   
4      12353      1   89.00   89.00   89.000000    89.00  67.078652   

     categ_1    categ_2    categ_3    categ_4  LastPurchase  FirstPurchase  
0  32.774995  33.831682  10.432693  12.949523            52            236  
1  32.543503   0.000000  45.940131   0.000000           117            227  
2  48.444976   0.000000  11.692584  27.900718           179            179  
3  11.479868   5.771846   0.707606   3.683714           131            165  
4  13.033708   0.000000   0.000000  19.887640            73             73  
...
{% endraw %} {% raw %}
from customer_segmentation_toolkit.analyse_customers import (
    plot_customers_pca,
    convert_customers_df_to_np,
    analyse_customers_pca,
)

# Analysing customers distribution via PCA
# NOTE: this rebinds the name `matrix`, which held the keywords count matrix in step 03.
matrix = convert_customers_df_to_np(transactions_per_user, N_PURCHASE_CLUSTERS)
# analyse_customers_pca returns a pair: the scaled matrix (used for clustering below) and the PCA object.
scaled_matrix, pca = analyse_customers_pca(matrix)

# NOTE(review): the plot receives the unscaled `matrix`, not `scaled_matrix` — confirm this is intended.
plot_customers_pca(matrix, pca)
2021-05-27T20:45:16.769256 image/svg+xml Matplotlib v3.4.2, https://matplotlib.org/
{% endraw %} {% raw %}
from customer_segmentation_toolkit.analyse_customers import compute_customer_clusters

# Computing customers clusters via Kmeans on the scaled customer matrix
clusters_clients = compute_customer_clusters(scaled_matrix, N_CUSTOMER_CLUSTERS)
print('Computed customers clusters via Kmeans:')
# Use print() rather than the IPython-only display(), which is never imported here:
# the snippet then also runs as a plain Python script and matches the purchase-clusters cell.
print(pd.Series(clusters_clients).value_counts())
Computed customers clusters via Kmeans:
2     1170
9      432
8      369
7      275
0      255
6      233
5      208
1      152
3       32
4       10
10       7
dtype: int64
{% endraw %} {% raw %}
from sklearn.metrics import silhouette_samples, silhouette_score
from customer_segmentation_toolkit.analyse_purchases import plot_silhouette

# Silhouette analysis of the customer clusters computed on the scaled customer matrix.
# silhouette_score yields one aggregate value for the whole clustering;
# it is stored here but not passed to the plot call below.
silhouette_avg = silhouette_score(scaled_matrix, clusters_clients)
# silhouette_samples yields one silhouette value per row of `scaled_matrix`.
sample_silhouette_values = silhouette_samples(scaled_matrix, clusters_clients)

# Plotting silhouette values
# NOTE(review): [-0.15, 0.55] is presumably the value range shown on the plot — confirm against plot_silhouette.
plot_silhouette(N_CUSTOMER_CLUSTERS, [-0.15, 0.55], len(scaled_matrix), sample_silhouette_values, clusters_clients)
2021-05-27T20:45:34.473446 image/svg+xml Matplotlib v3.4.2, https://matplotlib.org/
{% endraw %} {% raw %}
from customer_segmentation_toolkit.analyse_customers import plot_customer_categories

# Plotting customers categories from the scaled matrix and the cluster assignment.
# The recorded run emitted a matplotlib "Tight layout not applied" UserWarning from this call.
plot_customer_categories(scaled_matrix, clusters_clients, N_CUSTOMER_CLUSTERS)
/plain/github/mine/customer-segmentation-toolkit/customer_segmentation_toolkit/analyse_customers.py:157: UserWarning: Tight layout not applied. tight_layout cannot make axes height small enough to accommodate all axes decorations
  plt.tight_layout()
2021-05-27T20:45:35.857431 image/svg+xml Matplotlib v3.4.2, https://matplotlib.org/
{% endraw %} {% raw %}
from customer_segmentation_toolkit.analyse_customers import add_customer_clusters_info

# Combine the per-user transactions with the customer-cluster assignment
# into the result dataset.
merged_df = add_customer_clusters_info(
    transactions_per_user,
    clusters_clients,
)

# Summarise the resulting frame.
print('Constructed the result dataset:')
print(f'Shape: {merged_df.shape}')
print(f'Columns: {list(merged_df.columns)}')
Constructed the result dataset:
Shape: (3143, 14)
Columns: ['CustomerID', 'count', 'min', 'max', 'mean', 'sum', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4', 'LastPurchase', 'FirstPurchase', 'cluster']
{% endraw %} {% raw %}
from customer_segmentation_toolkit.analyse_customers import compute_aggregated_customer_clusters_info

# Aggregate the merged per-customer data into a per-cluster summary dataset.
selected_customers_df = compute_aggregated_customer_clusters_info(
    merged_df,
    N_PURCHASE_CLUSTERS,
    N_CUSTOMER_CLUSTERS,
    categ_threshold=SELECTED_CUSTOMERS_CATEG_THRESHOLD,
)

# Summarise the resulting frame.
print('Constructed the aggregated cluster info:')
print(f'Shape: {selected_customers_df.shape}')
print(f'Columns: {list(selected_customers_df.columns)}')
Constructed the aggregated cluster info:
Shape: (11, 14)
Columns: ['cluster', 'count', 'min', 'max', 'mean', 'sum', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4', 'LastPurchase', 'FirstPurchase', 'size']
{% endraw %} {% raw %}
# Directory that receives the artefacts of step 04; created if it is missing.
OUTPUT = Path('../data/output/04_data_analyse_customers')
OUTPUT.mkdir(parents=True, exist_ok=True)

# Persist the aggregated per-cluster summary and the per-customer cluster assignment.
selected_customers_df.to_csv(
    OUTPUT / 'no_live_data__cleaned__purchase_clusters__train__selected_customers_aggregated.csv',
    index=False,
)
merged_df.to_csv(
    OUTPUT / 'no_live_data__cleaned__purchase_clusters__train__customer_clusters.csv',
    index=False,
)

# Store the number of customer clusters next to the data.
(OUTPUT / 'n_customer_clusters.txt').write_text(str(N_CUSTOMER_CLUSTERS))

# List what the output directory now contains.
print(f'Output data saved to {OUTPUT}: {[p.name for p in Path(OUTPUT).iterdir()]}')
Output data saved to ../data/output/04_data_analyse_customers: ['n_customer_clusters.txt', 'no_live_data__cleaned__purchase_clusters__train__selected_customers_aggregated.csv', 'no_live_data__cleaned__purchase_clusters__train__customer_clusters.csv']
{% endraw %}

05. Download dataset and use it for training

{% raw %}
from sklearn.model_selection import train_test_split

from customer_segmentation_toolkit.data_zoo import download_data_csv

# Fetch the per-customer cluster dataset from the data zoo.
# NOTE(review): the recorded run of this cell failed with "HTTP Error 404: Not Found"
# for this path — verify that the file has been published to the data zoo.
csv = 'no_live_data__cleaned__purchase_clusters__train__customer_clusters.csv'
selected_customers: pd.DataFrame = download_data_csv(f'data/output/04_data_analyse_customers/{csv}')

# Features are the mean basket value and the per-category columns; the target is the cluster label.
X = selected_customers[['mean', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4']]
Y = selected_customers['cluster']

# Split features and target into train and test parts with train_size=0.8.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8)

# Last expression of the cell: the shapes of the four resulting parts.
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape
---------------------------------------------------------------------------
HTTPError                                 Traceback (most recent call last)
<ipython-input-28-03ec75640bb1> in <module>
      4 # Download dataset from the data_zoo:
      5 csv = 'no_live_data__cleaned__purchase_clusters__train__customer_clusters.csv'
----> 6 selected_customers: pd.DataFrame = download_data_csv(f'data/output/04_data_analyse_customers/{csv}')
      7 
      8 X = selected_customers[['mean', 'categ_0', 'categ_1', 'categ_2', 'categ_3', 'categ_4' ]]

/plain/github/mine/customer-segmentation-toolkit/customer_segmentation_toolkit/data_zoo.py in download_data_csv(path_relative, base_url)
     19     url = f'{base_url}/{normpath(path_relative)}'
     20     logging.info(f"Downloading dataset '{url}'")
---> 21     return pd.read_csv(url)

~/.pyenv/versions/3.7.6/lib/python3.7/site-packages/pandas/io/parsers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
    608     kwds.update(kwds_defaults)
    609 
--> 610     return _read(filepath_or_buffer, kwds)
    611 
    612 

~/.pyenv/versions/3.7.6/lib/python3.7/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    460 
    461     # Create the parser.
--> 462     parser = TextFileReader(filepath_or_buffer, **kwds)
    463 
    464     if chunksize or iterator:

~/.pyenv/versions/3.7.6/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    817             self.options["has_index_names"] = kwds["has_index_names"]
    818 
--> 819         self._engine = self._make_engine(self.engine)
    820 
    821     def close(self):

~/.pyenv/versions/3.7.6/lib/python3.7/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1048             )
   1049         # error: Too many arguments for "ParserBase"
-> 1050         return mapping[engine](self.f, **self.options)  # type: ignore[call-arg]
   1051 
   1052     def _failover_to_python(self):

~/.pyenv/versions/3.7.6/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1865 
   1866         # open handles
-> 1867         self._open_handles(src, kwds)
   1868         assert self.handles is not None
   1869         for key in ("storage_options", "encoding", "memory_map", "compression"):

~/.pyenv/versions/3.7.6/lib/python3.7/site-packages/pandas/io/parsers.py in _open_handles(self, src, kwds)
   1366             compression=kwds.get("compression", None),
   1367             memory_map=kwds.get("memory_map", False),
-> 1368             storage_options=kwds.get("storage_options", None),
   1369         )
   1370 

~/.pyenv/versions/3.7.6/lib/python3.7/site-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
    561         compression=compression,
    562         mode=mode,
--> 563         storage_options=storage_options,
    564     )
    565 

~/.pyenv/versions/3.7.6/lib/python3.7/site-packages/pandas/io/common.py in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
    287                 "storage_options passed with file object or non-fsspec file path"
    288             )
--> 289         req = urlopen(filepath_or_buffer)
    290         content_encoding = req.headers.get("Content-Encoding", None)
    291         if content_encoding == "gzip":

~/.pyenv/versions/3.7.6/lib/python3.7/site-packages/pandas/io/common.py in urlopen(*args, **kwargs)
    193     import urllib.request
    194 
--> 195     return urllib.request.urlopen(*args, **kwargs)
    196 
    197 

~/.pyenv/versions/3.7.6/lib/python3.7/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 
    224 def install_opener(opener):

~/.pyenv/versions/3.7.6/lib/python3.7/urllib/request.py in open(self, fullurl, data, timeout)
    529         for processor in self.process_response.get(protocol, []):
    530             meth = getattr(processor, meth_name)
--> 531             response = meth(req, response)
    532 
    533         return response

~/.pyenv/versions/3.7.6/lib/python3.7/urllib/request.py in http_response(self, request, response)
    639         if not (200 <= code < 300):
    640             response = self.parent.error(
--> 641                 'http', request, response, code, msg, hdrs)
    642 
    643         return response

~/.pyenv/versions/3.7.6/lib/python3.7/urllib/request.py in error(self, proto, *args)
    567         if http_err:
    568             args = (dict, 'default', 'http_error_default') + orig_args
--> 569             return self._call_chain(*args)
    570 
    571 # XXX probably also want an abstract factory that knows when it makes

~/.pyenv/versions/3.7.6/lib/python3.7/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    501         for handler in handlers:
    502             func = getattr(handler, meth_name)
--> 503             result = func(*args)
    504             if result is not None:
    505                 return result

~/.pyenv/versions/3.7.6/lib/python3.7/urllib/request.py in http_error_default(self, req, fp, code, msg, hdrs)
    647 class HTTPDefaultErrorHandler(BaseHandler):
    648     def http_error_default(self, req, fp, code, msg, hdrs):
--> 649         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    650 
    651 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 404: Not Found
{% endraw %}